# Apply filters:
#   1. drop rows that have no SALARY or no TITLE,
#   2. coerce SALARY to a numeric dtype (unparseable values become NaN),
#   3. keep only rows with a strictly positive salary (this also drops the
#      NaN values produced by the coercion, since NaN > 0 is False).
df_filtered = (
    df.dropna(subset=['SALARY', 'TITLE'])
    .assign(SALARY=lambda frame: pd.to_numeric(frame['SALARY'], errors='coerce'))
    .loc[lambda frame: frame['SALARY'] > 0]
)
print(f"Records after filtering: {len(df_filtered):,}")

# Work on an independent copy so the skill features added later do not
# touch df_filtered.
df_skills = df_filtered.copy()
# Focus on key ML/Data Science skills. We identified some key skills for
# ML/DS roles manually. The entries are grouped by theme below; their
# relative order is unchanged.
key_skills = [
    # Programming / query languages
    'Python (Programming Language)',
    'R (Programming Language)',
    'SQL (Programming Language)',
    # General ML / data disciplines
    'Machine Learning',
    'Data Science',
    'Data Analysis',
    'Statistics',
    'Artificial Intelligence',
    # Libraries and frameworks
    'TensorFlow',
    'PyTorch (Machine Learning Library)',
    'Pandas (Python Package)',
    'NumPy (Python Package)',
    'Scikit-Learn (Python Package)',
    # Big data tooling
    'Big Data',
    'Apache Spark',
    'Apache Hadoop',
    # Cloud platforms
    'Amazon Web Services',
    'Microsoft Azure',
    'Google Cloud Platform (Gcp)',
    # Visualization / BI
    'Data Visualization',
    'Tableau (Business Intelligence Software)',
    'Power BI',
    # Specialised ML areas
    'Natural Language Processing (NLP)',
    'Computer Vision',
    'Deep Learning',
]
print(f"Using focused {len(key_skills)} ML/Data Science skills for analysis")
# Create binary features for each key skill.
# A row gets 1 for a skill when the skill name occurs (case-insensitive,
# literal substring, missing values count as "no match") in any of the three
# skill text columns, otherwise 0.
# NOTE(review): because this is a substring test, a short name such as
# 'R (Programming Language)' would also match any longer entry that ends in
# "r (Programming Language)" -- confirm against the actual column contents.

# Character mapping used to turn a skill name into a column suffix:
# spaces and hyphens become underscores, parentheses are dropped.
_name_cleanup = str.maketrans({' ': '_', '-': '_', '(': None, ')': None})

for skill in key_skills:
    # Eg: R (Programming Language) --> has_r_programming_language
    skill_col_name = 'has_' + skill.lower().translate(_name_cleanup)

    # OR together the per-column matches.
    skill_found = None
    for source_col in ('SKILLS_NAME', 'SOFTWARE_SKILLS_NAME', 'SPECIALIZED_SKILLS_NAME'):
        col_hits = df_skills[source_col].str.contains(
            skill, case=False, na=False, regex=False
        )
        skill_found = col_hits if skill_found is None else skill_found | col_hits

    df_skills[skill_col_name] = skill_found.astype(int)

print("Binary skill features created")
# Create ML/DS role indicators using the focused skills.
# The names below are the `has_<cleaned skill name>` columns produced by the
# binary-feature step.
core_ml_skills = [
    'has_machine_learning',
    'has_artificial_intelligence',
    'has_tensorflow',
    'has_pytorch_machine_learning_library',
    'has_deep_learning',
    'has_natural_language_processing_nlp',
    'has_computer_vision',
]
core_ds_skills = [
    'has_python_programming_language',
    'has_r_programming_language',
    'has_statistics',
    'has_data_analysis',
    'has_big_data',
    'has_data_visualization',
    'has_data_science',
    'has_pandas_python_package',
    'has_numpy_python_package',
    'has_scikit_learn_python_package',
]

# Role indicators
# ML roles are straightforward: at least one core ML skill is present.
df_skills['is_ml_role'] = (
    df_skills[core_ml_skills].sum(axis=1) > 0
).astype(int)

# R language is primarily associated with Data Science field. So,
# if job requires R language or if it has more than one data science
# skills then it is considered DS role.
# Both comparisons must be parenthesised: `|` binds tighter than `==`, so
# `x == 1 | (y > 1)` would be parsed as `x == (1 | (y > 1))` and the
# "more than one DS skill" clause would not act as an independent condition.
df_skills['is_ds_role'] = (
    (df_skills['has_r_programming_language'] == 1)
    | (df_skills[core_ds_skills].sum(axis=1) > 1)
).astype(int)

# Combined flag: the posting is an ML role, a DS role, or both.
df_skills['is_ml_ds_role'] = (
    (df_skills['is_ml_role'] == 1) | (df_skills['is_ds_role'] == 1)
).astype(int)

# Remote work indicator: missing REMOTE_TYPE is treated as 0.
# NOTE(review): the value is only cast to int, not thresholded -- if
# REMOTE_TYPE holds more than two distinct codes, is_remote is not a 0/1
# flag. Confirm the encoding of REMOTE_TYPE.
df_skills['is_remote'] = df_skills['REMOTE_TYPE'].fillna(0).astype(int)

# Missing minimum experience is treated as 0 years.
df_skills['experience_years'] = df_skills['MIN_YEARS_EXPERIENCE'].fillna(0)

# df_final is another name for the same DataFrame object (no copy is made).
df_final = df_skills
# # Check which skills actually exist in the dataframe
# available_core_ml = [skill for skill in core_ml_skills if skill in df_skills.columns]
# available_ds_skills = [skill for skill in data_science_skills if skill in df_skills.columns]
# print(f"Available core ML skills ({len(available_core_ml)}): {available_core_ml}")
# print(f"Available DS skills ({len(available_ds_skills)}): {available_ds_skills}")
# # ML role if has core ML skills OR (Python + Statistics/Data Analysis)
# if available_core_ml:
# df_skills['has_core_ml'] = df_skills[available_core_ml].sum(axis=1) > 0
# else:
# df_skills['has_core_ml'] = False
# if available_ds_skills and 'has_python_programming_language' in df_skills.columns:
# # Check for Python + (Statistics OR Data Analysis OR Data Science)
# stats_cols = [col for col in ['has_statistics', 'has_data_analysis', 'has_data_science'] if col in df_skills.columns]
# if stats_cols:
# df_skills['has_ds_combo'] = (df_skills['has_python_programming_language'] == 1) & (df_skills[stats_cols].sum(axis=1) > 0)
# else:
# df_skills['has_ds_combo'] = False
# else:
# df_skills['has_ds_combo'] = False
# df_skills['is_ml_role'] = (df_skills['has_core_ml'] | df_skills['has_ds_combo']).astype(int)
# # Create remote work indicator
# df_skills['is_remote'] = df_skills['REMOTE_TYPE'].fillna(0).astype(int)
# df_skills['experience_years'] = df_skills['MIN_YEARS_EXPERIENCE'].fillna(0)
# df_final = df_skills
# Summary of the prepared dataset: total rows and how many of them carry
# the combined ML/DS role flag (sum of the 0/1 `is_ml_ds_role` column).
final_row_count = len(df_final)
ml_ds_role_count = df_final['is_ml_ds_role'].sum()
print(f"Final dataset size: {final_row_count:,}")
print(f"ML/Data Science roles identified: {ml_ds_role_count:,}")